Data analysis 1

Modeling

library("tidyverse")
library("broom")
library("ggrepel")
library("here")
augmented_table <- read_tsv("../data/03_augmented_table.tsv")
Rows: 2570560 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): GeneFeature, Sample_names, Patient, IDH_status, Localisation, Prim...
dbl  (7): Quantile, CPM, GEP_score, Age, Batch, TLS_status_bin, CPM_log2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
augmented_metadata <- read_tsv("../data/03_augmented_metadata.tsv")
Rows: 64 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): Sample_names, Patient, IDH_status, Localisation, Primary_or_Recurre...
dbl (4): GEP_score, Age, Batch, TLS_status_bin

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#MODEL 1: CPM log2 transformed ~ TLS_status
nested_data <- augmented_table |>
  group_by(GeneFeature) |>
  nest()  |>
  mutate(
    model = map(data, ~lm(CPM_log2 ~ TLS_status_bin, data = .x)),
    tidy_model = map(model, ~tidy(.x, conf.int = TRUE))
  )
gene_results <- nested_data |>
  unnest(tidy_model) |>
  filter(term == "TLS_status_bin") |>
  select(GeneFeature, estimate, p.value, conf.low, conf.high) |>
  mutate(
    q.value = p.adjust(p.value, method = "fdr"),
    significant = q.value < 0.01
  )
#MODEL 2: GEP_score ~ TLS_status
gep_model <- lm(GEP_score ~ TLS_status_bin, data = augmented_metadata)
gep_model_tidy <- tidy(gep_model, conf.int = TRUE)
#Volcano plot for all genes
volcano_plot <- gene_results |>
  drop_na(estimate, p.value, significant, GeneFeature) |>
  mutate(
    neg_log10_p = -log10(p.value),
    label = if_else(significant, GeneFeature, "")
  ) |>
  ggplot(aes(x = estimate, y = neg_log10_p, color = significant)) +
  geom_point(alpha = 0.6) +
  geom_text_repel(aes(label = label), size = 2, max.overlaps = 20) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  labs(
  title = "Volcano Plot of Gene Effects (CPM ~ TLS_status)",
  x = "Effect size (Estimate)",
  y = expression(-log[10](p.value)),
  color = "Significant" ) +
  theme_minimal()
#Print volcano plot
volcano_plot
Warning: ggrepel: 1065 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

ggsave(
  filename = "volcano_plot.png",
  path = here("results"),
  dpi = 300
)
Saving 7 x 5 in image
Warning: ggrepel: 1068 unlabeled data points (too many overlaps). Consider
increasing max.overlaps
#Plotting GEP score vs TLS status
GEP_score_vs_TLS_status <- ggplot(augmented_metadata, aes(
  x = factor(TLS_status_bin),
  y = GEP_score,
  fill = factor(TLS_status_bin)
)) +
  geom_boxplot(width = 0.5, alpha = 0.8, color = "black") +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  geom_jitter(
    aes(color = factor(TLS_status_bin)),
    width = 0.07,
    alpha = 0.6,
    size = 2
  ) +
  scale_fill_manual(
    values = c("0" = "lightsteelblue2", "1" = "indianred3")
  ) +
  scale_color_manual(
    values = c("0" = "lightsteelblue4", "1" = "indianred4")
  ) +
  labs(
    title = "GEP Score by TLS Status",
    x = "TLS Status",
    y = "GEP Score"
  ) +
  theme_classic(base_size = 15) +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.title = element_text(face = "bold")
  )

GEP_score_vs_TLS_status

ggsave(
  filename = "GEP_score_vs_TLS_status.png",
  path = here("results"),
  dpi = 300
)
Saving 7 x 5 in image
#Checking if genes found significant by TLS status overlap with the 18 signature GEP genes
signature_genes <- c(
  "CCL5", "CD27", "CD274", "CD276", "CD8A", "CMKLR1", "CXCL9",
  "CXCR6", "HLA-DQA1", "HLA-DRB1", "HLA-E", "IDO1", "LAG3",
  "NKG7", "PDCD1LG2", "PSMB10", "STAT1", "TIGIT"
)

tls_sig_genes <- gene_results |>
  filter(significant) |>
  pull(GeneFeature)

overlap_genes <- intersect(signature_genes, tls_sig_genes)

overlap_genes
[1] "CD27"  "CXCR6"
length(overlap_genes)
[1] 2
length(signature_genes)
[1] 18
overlap_table <- tibble(
  Gene = signature_genes,
  In_TLS_Significant = signature_genes %in% tls_sig_genes
)

overlap_table
# A tibble: 18 × 2
   Gene     In_TLS_Significant
   <chr>    <lgl>             
 1 CCL5     FALSE             
 2 CD27     TRUE              
 3 CD274    FALSE             
 4 CD276    FALSE             
 5 CD8A     FALSE             
 6 CMKLR1   FALSE             
 7 CXCL9    FALSE             
 8 CXCR6    TRUE              
 9 HLA-DQA1 FALSE             
10 HLA-DRB1 FALSE             
11 HLA-E    FALSE             
12 IDO1     FALSE             
13 LAG3     FALSE             
14 NKG7     FALSE             
15 PDCD1LG2 FALSE             
16 PSMB10   FALSE             
17 STAT1    FALSE             
18 TIGIT    FALSE             
siginifcant_genes_overlap <- ggplot(overlap_table, aes(
  x = In_TLS_Significant,
  y = Gene,
  color = In_TLS_Significant
)) +
  geom_point(size = 4) +
  scale_x_discrete(labels = c("FALSE" = "No", "TRUE" = "Yes")) +
  scale_color_manual(values = c("FALSE" = "lightsteelblue4", "TRUE" = "indianred3")) +
  labs(
    title = "Which Signature Genes Are TLS-Significant?",
    x = "",
    y = "Gene"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_line(color = "grey70", linewidth = 0.5),
    panel.grid.major.x = element_line(color = "grey70", linewidth = 0.5),
    panel.grid.minor.y = element_line(color = "grey70", linewidth = 0.3),
    panel.grid.minor.x = element_line(color = "grey70", linewidth = 0.3)
  )


siginifcant_genes_overlap

ggsave(
  filename = "siginificant_genes_overlap.png",
  path = here("results"),
  dpi = 300
)
Saving 7 x 5 in image